We are provided with a dataset of Nutritional Facts on which we perform the necessary cleaning, exploratory data analysis (EDA), and build various machine learning models. The Python libraries used for this project are:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import plotly
import plotly.express as px
from scipy.stats import chi2,chi2_contingency
import scipy.stats as stats
from sklearn import preprocessing
import statsmodels.formula.api as smf
from sklearn.preprocessing import MinMaxScaler
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.model_selection import train_test_split
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metric
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
neutrients=pd.read_csv("neutrional facts.csv")
neutrients.head()
neutrients.info()
neutrients.describe().T
unique_categories=neutrients["Category"].unique()
print("Number of Unique categories =",len(unique_categories))
neutrients["Category"].value_counts()
barchart=px.bar(data_frame=neutrients,x="Category",opacity=0.9,orientation="v",color="Category")
barchart.show()
missing_value_columns=[]
# Scan every column for placeholder strings that actually denote missing values
for i in neutrients:
    for x in ["?", " ", "-", "Nan", "v"]:
        if x in list(neutrients[i]):
            missing_value_columns.append(i)
            break
for i in missing_value_columns:
    print(i)
# Replace the placeholder strings with proper NaNs
for i in missing_value_columns:
    neutrients[i].replace(['?', ' ', '-', 'Nan', 'v'], np.nan, inplace=True)
neutrients.isnull().sum()
for i in neutrients.columns:
    if i not in ['Category', "Item", "Serving Size"]:
        neutrients[i] = pd.to_numeric(neutrients[i])
neutrients.info()
def extract(x):
    # Keep only the leading numeric token of the serving size, e.g. "4.8 oz" -> 4.8
    x = x.split()
    oz = x[0]
    return float(oz)
neutrients["Serving Size"]=neutrients["Serving Size"].apply(extract)
neutrients["Serving Size"]
corr_df=neutrients.corr(numeric_only=True)
plt.figure(figsize=(15,15))
sns.heatmap(corr_df,cmap="YlGnBu",annot=True)
values_highly_correlated = {
    "Calories": ["Protein", "Carbohydrates (% Daily Value)", 'Carbohydrates', 'Saturated Fat', 'Total Fat', 'Total Fat (% Daily Value)'],
    "Calories from Fat": ['Total Fat', 'Total Fat (% Daily Value)', 'Saturated Fat', 'Sodium (% Daily Value)', 'Protein'],
    "Saturated Fat (% Daily Value)": ['Total Fat', 'Total Fat (% Daily Value)', 'Saturated Fat'],
    "Sodium": ['Total Fat', 'Total Fat (% Daily Value)', 'Sodium (% Daily Value)'],
    'Vitamin A (% Daily Value)': ['Trans Fat', 'Cholesterol', 'Carbohydrates', 'Carbohydrates (% Daily Value)'],
    'Iron (% Daily Value)': ['Total Fat', 'Total Fat (% Daily Value)', 'Sodium (% Daily Value)', 'Dietary Fiber', 'Dietary Fiber (% Daily Value)']
}
prediction_imputer = {}
location_of_null = {}
for y in values_highly_correlated:
    x = values_highly_correlated[y]
    # Remember where the nulls are, fill them temporarily so the model can fit,
    # then predict values for the whole column
    location_of_null[y] = neutrients.loc[neutrients[y].isnull(), y].index
    neutrients[y].fillna(0, inplace=True)
    model_imputer = LinearRegression()
    model_imputer.fit(neutrients[x], neutrients[y])
    prediction_imputer[y] = model_imputer.predict(neutrients[x])
    print("R² score for predicting", y, model_imputer.score(neutrients[x], neutrients[y]))
print(location_of_null)
for i in location_of_null:
    # Overwrite only the originally-missing entries with the regression predictions
    neutrients.loc[location_of_null[i], i] = prediction_imputer[i][location_of_null[i]]
neutrients.isnull().sum()
fig = px.box(pd.melt(neutrients), x="variable", y="value", points="outliers")
fig.show()
for col in neutrients.drop(columns=["Item", "Category", "Trans Fat"]):
    # Calculating values of Q1, Q3 and IQR
    Q1 = neutrients[col].quantile(0.25)
    Q3 = neutrients[col].quantile(0.75)
    IQR = Q3 - Q1
    # Mask values beyond the 1.5*IQR fences, then fill them with the column median
    neutrients[col].where(~((neutrients[col] < (Q1 - 1.5 * IQR)) | (neutrients[col] > (Q3 + 1.5 * IQR))), np.nan, inplace=True)
    neutrients[col].fillna(neutrients[col].median(), inplace=True)
fig = px.box(pd.melt(neutrients), x="variable", y="value", points="outliers")
fig.show()
two_way_table=pd.crosstab(columns=neutrients["Category"],index=neutrients["Item"])
two_way_table
STEP 1: Defining the null and alternate hypotheses
H0: Category and Item are independent of each other.
HA: Category and Item are not independent.
STEP 2: Selecting a significance level (alpha = 0.05)
STEP 3: Identifying the degrees of freedom: (rows - 1) * (columns - 1) = 2072
STEP 4: Computing the critical value
critical_value=chi2.ppf(1-.05,2072)
critical_value
STEP 5: Running the chi-square test of independence
chi2_contingency(two_way_table)
In the output of chi2_contingency above, the returned tuple contains the test statistic, the p-value, the degrees of freedom, and the table of expected frequencies. If the test statistic exceeds the critical value (equivalently, if the p-value is below 0.05), we reject the null hypothesis of independence.
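As a quick sketch (using two_way_table and critical_value from the cells above), the returned tuple can be unpacked and checked explicitly:
stat, p_value, dof, expected = chi2_contingency(two_way_table)
print("Test statistic =", stat, ", p-value =", p_value, ", degrees of freedom =", dof)
# Reject H0 when the statistic exceeds the critical value (equivalently, when p-value < 0.05)
if stat > critical_value:
    print("Reject H0: Category and Item are not independent")
else:
    print("Fail to reject H0")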
barchart=px.bar(data_frame=neutrients,x="Category",opacity=0.9,orientation="v",color="Item")
barchart.show()
le=preprocessing.LabelEncoder()
neutrients["label_Encoded_Categories"]=le.fit_transform(neutrients["Category"])
neutrients["Total_fat"]=neutrients["Total Fat"] # duplicate column without the space, since the ols formula API cannot handle spaces in names
poly_model=smf.ols(formula="label_Encoded_Categories~Total_fat",data=neutrients).fit()
poly_model.summary()
barchart=px.bar(data_frame=neutrients,x="Category",opacity=0.9,orientation="v",color="Total Fat")
barchart.show()
- First, we use a label encoder to encode the Items
le=preprocessing.LabelEncoder()
neutrients["label_Encoded_Items"]=le.fit_transform(neutrients["Item"])
poly_model=smf.ols(formula="label_Encoded_Items~Protein",data=neutrients).fit()
poly_model.summary()
poly_model=smf.ols(formula='label_Encoded_Items ~ Sugars',data=neutrients).fit()
poly_model.summary()
neutrients
category_counts = neutrients['Category'].value_counts()
category_calories = neutrients.groupby('Category')['Calories'].agg(['sum', 'mean'])
# Align the sums and means to the same category order as the counts,
# so each row's count, sum and mean all refer to the same category
by_category = pd.DataFrame({'Category': category_counts.index,
                            'Count': category_counts.values,
                            'Calories (Sum)': category_calories.loc[category_counts.index, 'sum'].values,
                            'Calories (Mean)': category_calories.loc[category_counts.index, 'mean'].values})
by_category.reset_index(drop=True)
fig,(ax1,ax2) = plt.subplots(1,2)
fig.set_size_inches(15.5, 7.5)
ax1.set_title("Total of Items in Each Category")
ax1.pie(by_category['Count'],labels =by_category['Category'],autopct = '%1.1f%%')
ax2.set_title("Total of Calories in Each Category")
ax2.pie(by_category['Calories (Sum)'],labels =by_category['Category'],autopct = '%1.1f%%')
fig.suptitle('Calories by Category',fontsize = 20)
fig.legend(by_category['Category'],ncol=4,loc=8)
SmoothShakes = neutrients[neutrients["Category"] == "Smoothies & Shakes"]
SmoothShakes = SmoothShakes.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=SmoothShakes,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
Chicken_and_fish= neutrients[neutrients["Category"] == "Chicken & Fish"]
Chicken_and_fish = Chicken_and_fish.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=Chicken_and_fish,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
Beverages= neutrients[neutrients["Category"] == "Beverages"]
Beverages = Beverages.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=Beverages,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
Beef_and_Pork= neutrients[neutrients["Category"] == "Beef & Pork"]
Beef_and_Pork =Beef_and_Pork.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=Beef_and_Pork,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
coffee_and_tea = neutrients[neutrients["Category"] == "Coffee & Tea"]
coffee_and_tea = coffee_and_tea.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=coffee_and_tea,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
Salads = neutrients[neutrients["Category"] == "Salads"]
Salads = Salads.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=Salads,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
Snacks_and_Sides = neutrients[neutrients["Category"] == "Snacks & Sides"]
Snacks_and_Sides = Snacks_and_Sides.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=Snacks_and_Sides,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
Desserts = neutrients[neutrients["Category"] == "Desserts"]
Desserts = Desserts.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=Desserts,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
Breakfast = neutrients[neutrients["Category"] == "Breakfast"]
Breakfast = Breakfast.sort_values("Calories", ascending = False)
barchart=px.bar(data_frame=Breakfast,x="Item",y="Calories",opacity=0.9,orientation="v",color="Calories")
barchart.show()
SmoothShakes = neutrients[neutrients["Category"] == "Smoothies & Shakes"]
Beverages= neutrients[neutrients["Category"] == "Beverages"]
coffee_and_tea = neutrients[neutrients["Category"] == "Coffee & Tea"]
drinks=pd.concat([SmoothShakes,Beverages,coffee_and_tea])
drinks
drink_group_serving_size=drinks.groupby("Serving Size").sum(numeric_only=True)
drink_group_serving_size
px.scatter(x=drink_group_serving_size.index,y=drink_group_serving_size["Calories"])
Breakfast = neutrients[neutrients["Category"] == "Breakfast"]
Beef_and_Pork= neutrients[neutrients["Category"] == "Beef & Pork"]
Chicken_and_fish= neutrients[neutrients["Category"] == "Chicken & Fish"]
Salads = neutrients[neutrients["Category"] == "Salads"]
Snacks_and_Sides = neutrients[neutrients["Category"] == "Snacks & Sides"]
Desserts = neutrients[neutrients["Category"] == "Desserts"]
food=pd.concat([Breakfast,Beef_and_Pork,Chicken_and_fish,Salads,Snacks_and_Sides,Desserts])
food_group_serving_size=food.groupby("Serving Size").sum(numeric_only=True)
px.scatter(x=food_group_serving_size.index,y=food_group_serving_size["Calories"])
neutrients_group_serving_size=neutrients.groupby("Serving Size").sum(numeric_only=True)
px.scatter(x=neutrients_group_serving_size.index,y=neutrients_group_serving_size["Calories"])
#Normality test
#H0= The sample comes from a normal distribution.
#HA= The sample does not come from a normal distribution.
Normality_df=pd.DataFrame()
dictionary={}
import scipy.stats as stats
for i in neutrients.columns.drop(["Category","Item"]):
    a, b = stats.shapiro(neutrients[i])
    if b <= .05:
        c = "The null hypothesis can be rejected"
    else:
        c = "Null hypothesis cannot be rejected"
    dictionary[i] = [a, b, c]
Normality_df=pd.DataFrame(dictionary,index=["test-statistic","p-value","Conclusion"])
Normality_df.T
#Normality test
#H0= The sample comes from a normal distribution.
#HA= The sample does not come from a normal distribution.
Normality_df=pd.DataFrame()
dictionary={}
import scipy.stats as stats
for i in neutrients.columns.drop(["Category","Item"]):
    a, b = stats.kstest(neutrients[i], 'norm')
    if b <= .05:
        c = "The null hypothesis can be rejected"
    else:
        c = "Null hypothesis cannot be rejected"
    dictionary[i] = [a, b, c]
Normality_df=pd.DataFrame(dictionary,index=["test-statistic","p-value","Conclusion"])
Normality_df.T
import seaborn as sns
from scipy.stats import norm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = neutrients.drop(["Category","Item","Trans Fat"],axis=1)
list_columns = df.columns
fig, ax = plt.subplots(figsize=(12, 10))
for i, col in enumerate(df.columns):
    plt.subplot(5, 5, i+1)
    sns.distplot(df.iloc[:, i], fit=norm, kde=False, ax=plt.gca())
    plt.title(list_columns[i])
plt.tight_layout()
import seaborn as sns
from scipy.stats import norm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = neutrients.drop(["Category","Item","Trans Fat"],axis=1)
list_columns = df.columns
fig, ax = plt.subplots(figsize=(15, 15))
for i, col in enumerate(df.columns):
    plt.subplot(5, 5, i+1)
    stats.probplot(df.iloc[:, i], dist="norm", plot=plt)
    plt.title(list_columns[i])
plt.tight_layout()
AV = AutoViz_Class()
AV.AutoViz('neutrional facts.csv')
AutoViz() provides visualizations of a dataset similar to sweetviz.
It provides:
1) A summary of columns based on their datatype
2) Pairwise plots of variables
3) Distplots, boxplots and probability plots
4) Violin plots of all continuous variables
5) Heatmap of all continuous variables
6) Barplots of continuous variables based on categories
Pre-processing Steps
Finding K for KMeans using elbow method
Predicting Clusters
Displaying the clusters
Verifying the model using Silhouette
Verifying the model using calinski_harabasz
neutrient_copy=neutrients.copy()
corr=neutrient_copy.corr(method="spearman",numeric_only=True)
plt.figure(figsize=(20,20))
sns.heatmap(corr,cmap="YlGnBu",annot=True)
Checking for potential duplicates (highly correlated columns)
cor_matrix = corr.abs()
# Keep only the upper triangle of the correlation matrix (np.bool is deprecated, so use bool)
upper_triu = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_triu.columns if any(upper_triu[column] > 0.80)]
print(to_drop)
Dropping the columns identified in the above cell
neutrient_copy.drop(to_drop,axis=1,inplace=True)
Dropping Category and Item because we had label encoded them previously
neutrient_copy.drop(["Category","Item"],axis=1,inplace=True)
scaled_values = neutrient_copy[['label_Encoded_Categories', 'label_Encoded_Items', 'Serving Size', 'Calories', 'Trans Fat',
                                'Carbohydrates', 'Dietary Fiber', 'Sugars', 'Vitamin A (% Daily Value)',
                                'Vitamin C (% Daily Value)']]
# finding the min and max of the numerical columns
scaler = MinMaxScaler().fit(scaled_values)
# scaling columns to a common range
data_normalized = scaler.transform(scaled_values)# Scaled data
normalized_neutrients=pd.DataFrame(data_normalized,index=neutrient_copy.index,columns=scaled_values.columns)
normalized_neutrients.head()
X=normalized_neutrients.iloc[:,2:]
X
Finding K for KMeans using the elbow method
sse = []
k_rng = range(1,10)
for k in k_rng:
    km = KMeans(n_clusters=k)
    km.fit(X)
    sse.append(km.inertia_)
plt.xlabel('K')
plt.ylabel('Sum of squared error')
plt.plot(k_rng,sse)
From the above graph we can see candidate elbows at K=2 and K=3, but since the error is lower at 3 we take K as 3.
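To back this reading of the graph, a small sketch that prints the relative drop in SSE between consecutive values of K (using the sse list computed above):
# Percentage decrease in SSE for each increment of K
for k, prev, curr in zip(k_rng[1:], sse, sse[1:]):
    print(f"K={k}: SSE drops by {100 * (prev - curr) / prev:.1f}%")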
km = KMeans(n_clusters=3)
y_predicted = km.fit_predict(X)
y_predicted
Cluster centers
km.cluster_centers_
Adding Cluster column to dataframe
normalized_neutrients['cluster']=y_predicted
normalized_neutrients.head()
Inference from the above step
Since we took K as 3, we split the data into 3 clusters
cluster0=normalized_neutrients[normalized_neutrients["cluster"]==0].iloc[:,1:]
cluster0
df=normalized_neutrients
df1 = df[df.cluster==0]
scalar = StandardScaler()
# fitting
scalar.fit(df1)
scaled_data = scalar.transform(df1)
# Let's say, components = 2
pca = PCA(n_components = 2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
# giving a larger plot
plt.figure(figsize =(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c = df1['cluster'],cmap="viridis")
# labeling x and y axes
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
cluster1=normalized_neutrients[normalized_neutrients["cluster"]==1].iloc[:,1:]
cluster1
df=normalized_neutrients
df2 = df[df.cluster==1]
scalar = StandardScaler()
# fitting
scalar.fit(df2)
scaled_data = scalar.transform(df2)
# Let's say, components = 2
pca = PCA(n_components = 2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
# giving a larger plot
plt.figure(figsize =(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c = df2['cluster'], cmap ='inferno')
# labeling x and y axes
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
cluster2=normalized_neutrients[normalized_neutrients["cluster"]==2].iloc[:,2:]
cluster2
df=normalized_neutrients
df3 = df[df.cluster==2]
scalar = StandardScaler()
# fitting
scalar.fit(df3)
scaled_data = scalar.transform(df3)
# Let's say, components = 2
pca = PCA(n_components = 2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
# giving a larger plot
plt.figure(figsize =(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c = df3['cluster'], cmap ='plasma')
# labeling x and y axes
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
scalar = StandardScaler()
# fitting
scalar.fit(normalized_neutrients)
scaled_data = scalar.transform(normalized_neutrients)
# Let's say, components = 2
pca = PCA(n_components = 2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
# giving a larger plot
plt.figure(figsize =(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c = normalized_neutrients['cluster'], cmap ='plasma')
# labeling x and y axes
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
#Double check by the Silhouette score
# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,12), metric='silhouette', timings=False)
visualizer.fit(X)        # Fit the data to the visualizer
visualizer.show()
metric.calinski_harabasz_score(X, km.labels_)
visualizer = KElbowVisualizer(km, k=(2,12), metric='calinski_harabasz', timings=True)
visualizer.fit(X)
visualizer.show()
Inference from above
Selecting the predictors
Splitting the data
Creating the model
Validating Model
Checking Model Summary
plt.figure(figsize=(25,25))
#sns.pairplot(neutrients)
sns.pairplot(x_vars=['Serving Size', 'Calories', 'Calories from Fat',
'Total Fat', 'Total Fat (% Daily Value)', 'Saturated Fat',
'Saturated Fat (% Daily Value)'],y_vars=["Calories"],data=neutrients)
sns.pairplot(x_vars=['Trans Fat', 'Cholesterol',
'Cholesterol (% Daily Value)', 'Sodium', 'Sodium (% Daily Value)',
'Carbohydrates', 'Carbohydrates (% Daily Value)'],y_vars="Calories",data=neutrients)
sns.pairplot(x_vars=['Dietary Fiber',
'Dietary Fiber (% Daily Value)', 'Sugars', 'Protein',
'Vitamin A (% Daily Value)', 'Vitamin C (% Daily Value)',
'Calcium (% Daily Value)'],y_vars="Calories",data=neutrients)
sns.pairplot(x_vars=['Iron (% Daily Value)',
'label_Encoded_Categories', 'Total_fat', 'label_Encoded_Items'],y_vars="Calories",data=neutrients)
plt.figure(figsize=(20,20))
sns.heatmap(neutrients.corr(method="spearman",numeric_only=True),cmap="YlGnBu",annot=True)
By analysing the heatmap and the various pairplots, we conclude that the following variables hold a linear relationship with Calories: Saturated Fat, Protein, Cholesterol, Trans Fat and Total Fat.
So we consider these variables as predictors
features=neutrients[["Saturated Fat","Protein","Cholesterol","Trans Fat","Total Fat"]]
target=neutrients["Calories"]
X_train,X_test,Y_train,Y_test=train_test_split(features,target,test_size=.35,random_state=100)
For splitting, we use 65% of the data for training and 35% for testing
model=LinearRegression()
model.fit(X_train,Y_train)
Checking the coefficient and intercept values
print("Coefficient :",model.coef_)
print("Intercept :",model.intercept_)
Checking the R² score of the model on the training and testing data
print("Training R² =",model.score(X_train,Y_train))
print("Testing R² =",model.score(X_test,Y_test))
prediction=model.predict(features)
# Residual plot: a patternless scatter around zero supports the linearity assumption
sns.residplot(x=prediction.reshape(-1), y=neutrients["Calories"], lowess=True)
from scipy import stats
residual_error=neutrients['Calories']-prediction.reshape(-1)
plt.figure(figsize=(7,7))
stats.probplot(residual_error, dist='norm', plot=plt)
plt.title("Normal Q-Q Plot")
For the above data set, the Q-Q plot of the residuals of the best-fit model (shown above) indicates adherence to the normality assumption, since the residual points lie close to the normal line.
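As a numeric complement to the visual check, a Shapiro-Wilk test can be run on the residuals (a small sketch; residual_error is computed above):
# Shapiro-Wilk test on the residuals: H0 = residuals are normally distributed
w_stat, p_val = stats.shapiro(residual_error)
print("Shapiro-Wilk statistic =", w_stat, ", p-value =", p_val)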
sd=np.sqrt(np.sum(np.square(residual_error))/(residual_error.size-2))
# standardised residuals
sd_error=residual_error/sd
sq_abs_sd_err=np.sqrt(np.abs(sd_error))
plt.figure(figsize=(7,7))
sns.regplot(x=prediction.reshape(-1), y=sq_abs_sd_err,
            scatter=True,
            lowess=True,
            line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plt.ylabel("Standardized residuals")
plt.xlabel("Fitted value")
The linear regression model is said to abide by the homoscedasticity assumption if no specific pattern is observed in the scale-location plot. The scale-location plot of the best-fit model for the nutritional facts data set is shown above.
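To supplement the visual check with a formal test, a Breusch-Pagan test from statsmodels can be applied to the residuals (a sketch, assuming the features DataFrame and residual_error computed earlier):
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm
# Breusch-Pagan test: H0 = the residuals are homoscedastic
lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(residual_error, sm.add_constant(features))
print("LM statistic =", lm_stat, ", p-value =", lm_pvalue)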
from statsmodels.stats.outliers_influence import variance_inflation_factor
# calculating the VIF for each attribute
vif = pd.Series([variance_inflation_factor(features.values, idx)
                 for idx in range(features.shape[1])],
                index=features.columns)
print(vif)
From the VIF values above we can say the predictors are not highly multicollinear
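As a quick rule-of-thumb check (a VIF above 10, or above 5 in stricter texts, is commonly treated as problematic multicollinearity), we can filter the vif series computed above:
# Flag any predictor whose VIF exceeds the conventional threshold of 10
print(vif[vif > 10])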
import statsmodels.api as sm
x=sm.add_constant(features)
y=target
model_summ=sm.OLS(y,x).fit()
model_summ.summary()